rm(list=ls())
library(survival)
library(survminer)
library(preprocessCore)
library(plyr)
library(ggpubr)

# ---- Configuration -------------------------------------------------------
# All GDC project ids, restricted to those whose id contains "TCGA".
projects <- TCGAbiolinks::getGDCprojects()$project_id
projects <- projects[grepl("TCGA", projects)]

# Directory holding the per-project "<project>Data.csv",
# "<project>Metadata.csv" and "<project>Clinical.csv" files.
path <- "Z:/Bioinformatics/ExternalDatabases/TCGAbiolinksAnalysis/UnnormalizedData/"

# Accumulator for the per-project expression tables built further below.
stor <- c()

# Collagen genes whose expression is averaged per sample.
# Matching against the expression table is case-sensitive, so every symbol
# must be upper case ("COL20A1" was previously misspelled "COl20A1" and
# therefore never matched).
Gene <- c(
  "COL18A1", "COL16A1", "COL14A1", "COL10A1", "COL9A3", "COL9A2", "COL8A1",
  "COL6A5", "COL6A2", "COL6A1", "COL4A6", "COL4A3", "COL4A1", "COL3A1",
  "COL2A1", "COL27A1", "COL26A1", "COL25A1", "COL20A1", "COL1A2", "COL1A1"
)
#Gene="LAIR1"

# ---- Load collagen expression for every project --------------------------
# For each project, read "<project>Data.csv", keep the rows whose gene symbol
# (the part of column `X` before the first "|") is listed in `Gene`, and
# transpose so that rows are samples and columns are genes. The per-project
# tables are collected in a list and bound once at the end instead of growing
# `stor` with rbind() on every iteration.
#
# NOTE(review): the original loop also read "<project>Metadata.csv" and built
# a vector of "Primary solid Tumor" barcodes, but never used it to subset the
# samples. All sample columns are therefore kept here, exactly as before.
# Confirm whether a tumor-only filter was intended.
expr_by_project <- vector("list", length(projects))

for (k in seq_along(projects)) {
  project <- projects[k]

  # `path` already ends in "/"; the extra "/" reproduces the file name the
  # script has always used.
  datPath <- paste0(path, "/", project, "Data.csv")
  dat <- read.csv(datPath)

  # Gene symbol = text before the first "|" in the identifier column.
  symbols <- sub("\\|.*$", "", as.character(dat$X))
  keep <- which(symbols %in% Gene)

  tumorDat <- dat[keep, ]
  tumorDat$X <- NULL
  tumorDat <- t(tumorDat)               # samples in rows, genes in columns
  colnames(tumorDat) <- symbols[keep]
  tumorDat <- data.frame(tumorDat)
  tumorDat$proj <- project

  expr_by_project[[k]] <- tumorDat
}

# The list is intentionally unnamed: named arguments would make
# rbind.data.frame() prefix the sample row names, which are used later as
# patient identifiers.
stor <- do.call(rbind, expr_by_project)

# ---- Log-transform and average the collagen genes per sample --------------
# Every column of `stor` except the project label is a gene.
gene_cols <- setdiff(colnames(stor), "proj")

# Coerce each gene column to numeric and apply log2(x + 1) column by column.
# The previous code wrote columns 2..k into columns 1..k, which misaligned
# every gene with its neighbour (or failed on the length mismatch).
numericDat <- data.frame(
  lapply(stor[gene_cols], function(x) log2(as.numeric(as.character(x)) + 1)),
  check.names = FALSE
)
numericDat$proj <- stor$proj

# Mean log2 expression across the collagen genes for each sample; missing
# values are ignored (a sample with no values at all yields NaN).
avg_expr <- unname(
  apply(as.matrix(numericDat[gene_cols]), 1, mean, na.rm = TRUE)
)

# One row per sample: project label, average expression and the sample id
# taken from the row names of `stor`.
AvgCollagenExpression <- data.frame(
  project = stor$proj,
  AvgCollExp = avg_expr,
  patIds = row.names(stor),
  stringsAsFactors = FALSE
)





# Build a patient-level id for every sample: split the sample id on "." and
# re-join its first three fields with "-".
id_fields <- strsplit(as.vector(AvgCollagenExpression$patIds), "\\.")
idsMod <- unlist(lapply(id_fields, function(fields) {
  paste(fields[1:3], collapse = "-")
}))
AvgCollagenExpression$modPatIds <- idsMod


# ---- Per-project survival analysis ----------------------------------------

# Collapse all clinical columns whose name matches `pattern` into a single
# numeric value per row.
#   clin    : clinical data frame (one row per patient)
#   pattern : regular expression matched against the column names of `clin`
#   pick    : function applied to the non-missing values of a row (min / max)
# Returns a numeric vector with one entry per row of `clin`; rows without any
# non-missing value (or when no column matches) yield NA.
collapse_clinical_days <- function(clin, pattern, pick) {
  cols <- grep(pattern, colnames(clin))
  vals <- as.matrix(clin[, cols, drop = FALSE])
  # Force numeric storage so min/max never compare strings lexically.
  storage.mode(vals) <- "double"
  vapply(seq_len(nrow(vals)), function(i) {
    observed <- vals[i, !is.na(vals[i, ])]
    if (length(observed) > 0L) pick(observed) else NA_real_
  }, numeric(1))
}

# Plots of the projects that pass the p-value cutoff are stored in the first
# `counter - 1` slots of `plts`; `relevantProj` holds the matching project ids.
plts <- vector("list", length(projects))
counter <- 1
relevantProj <- c()

for (project in projects) {
  AvgCollagenExpression_now <-
    AvgCollagenExpression[which(AvgCollagenExpression$project == project), ]

  # ---- Process the clinical data ----
  clinicalData <- read.csv(paste0(path, project, "Clinical.csv"))

  # New tumor event: earliest recorded day. Death and last follow-up: the
  # latest (most recent) recorded day.
  all_clin <- data.frame(
    new_tumor_days = collapse_clinical_days(
      clinicalData, "days_to_new_tumor_event_after_initial_treatment", min
    ),
    death_days = collapse_clinical_days(clinicalData, "days_to_death", max),
    followUp_days = collapse_clinical_days(
      clinicalData, "days_to_last_follow_up", max
    )
  )

  # Time to new tumor, falling back to last follow-up when no event is known.
  all_clin$new_time <- ifelse(
    is.na(all_clin$new_tumor_days),
    all_clin$followUp_days,
    all_clin$new_tumor_days
  )

  # Time to death, falling back to last follow-up for censored patients.
  all_clin$new_death <- ifelse(
    is.na(all_clin$death_days),
    all_clin$followUp_days,
    all_clin$death_days
  )

  # Event indicator: 0 when vital_status is 'Alive', 1 for any other value.
  all_clin$death_event <-
    ifelse(clinicalData$vital_status == 'Alive', 0, 1)

  # Upper-cased patient barcodes become the row names and the join key.
  rownames(all_clin) <- toupper(clinicalData$bcr_patient_barcode)
  all_clin$Age <- clinicalData$age_at_index
  all_clin$modPatIds <- row.names(all_clin)

  AvgCollagenExpression_now <-
    join(AvgCollagenExpression_now, all_clin, by = "modPatIds")

  # Drop samples without a usable survival time.
  AvgCollagenExpression_now <-
    AvgCollagenExpression_now[!is.na(AvgCollagenExpression_now$new_death), ]

  # Keep only the bottom and top quartile of average collagen expression.
  quan <- quantile(AvgCollagenExpression_now$AvgCollExp, na.rm = TRUE)
  ind <- which(AvgCollagenExpression_now$AvgCollExp <= quan[2])
  ind1 <- which(AvgCollagenExpression_now$AvgCollExp >= quan[4])
  AvgCollagenExpression_now <- AvgCollagenExpression_now[c(ind, ind1), ]

  # Group label and survival time in months (days / 30), stored as columns so
  # the model formulas can be resolved from `data` alone.
  AvgCollagenExpression_now$event_rna <-
    ifelse(AvgCollagenExpression_now$AvgCollExp >= quan[4],
           "HighExpression",
           "LowExpression")
  AvgCollagenExpression_now$surv_months <-
    AvgCollagenExpression_now$new_death / 30

  # Cox model; its summary only feeds the optional text annotation below.
  # "HighExpression" sorts first and is therefore the reference level, so the
  # reported hazard ratio is LowExpression relative to HighExpression.
  cox.ph <- coxph(Surv(surv_months, death_event) ~ event_rna,
                  data = AvgCollagenExpression_now)
  coeffs <- coef(summary(cox.ph))

  txt <- paste(
    paste0("HR(low vs high):", round(coeffs[1, "exp(coef)"], digits = 2)),
    paste0("pr(HR):", round(coeffs[1, "Pr(>|z|)"], digits = 2)),
    paste0("n(high):", length(ind1)),
    paste0("n(low):", length(ind)),
    sep = ' '
  )

  fit <- survfit(Surv(surv_months, death_event) ~ event_rna,
                 data = AvgCollagenExpression_now)
  x_q <- quantile(na.exclude(AvgCollagenExpression_now$surv_months))
  pval <- surv_pvalue(fit, data = AvgCollagenExpression_now)$pval

  gp <- ggsurvplot(fit, data = AvgCollagenExpression_now, pval = TRUE)
  gp$plot <- gp$plot +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          legend.position = "none")
  #gp$plot= gp$plot + annotate("text",x = round((x_q[4] + x_q[5]) / 2),y = 0.8,label = txt)

  # Keep projects whose p-value is below the cutoff. Store the ggplot itself:
  # `plts[counter] = gp` only kept the first element of the ggsurvplot list
  # (with a length-mismatch warning), which happened to be this plot.
  if (!is.na(pval) && pval < 0.06) {
    plts[[counter]] <- gp$plot
    relevantProj <- c(relevantProj, project)
    counter <- counter + 1
  }
}

# ---- Assemble and save the multi-panel figure -----------------------------
# Stop early when no project was selected: `1:length(relevantProj)` would
# evaluate to c(1, 0) and an empty image would be written.
if (length(relevantProj) == 0) {
  stop("No project passed the p-value cutoff; nothing to plot.",
       call. = FALSE)
}

# Only the leading slots of `plts` were filled by the survival loop.
plts <- plts[seq_along(relevantProj)]

# Panel labels are the project ids without the "TCGA-" prefix.
relevantProj <- gsub("TCGA-", "", relevantProj)

# NOTE(review): the grid is 3 x 3; with more than nine selected projects
# ggarrange() is expected to return several pages - confirm that case is
# handled if it can occur.
figure <- ggarrange(
  plotlist = plts,
  ncol = 3,
  nrow = 3,
  labels = relevantProj,
  font.label = list(size = 30, color = "black"),
  label.y = 0.5,
  label.x = 0.5
)

# Annotate the figure with common axis labels and write it to disk.
# print() is explicit so the figure is also drawn when the script is run via
# source(); `finally` closes the device even if drawing fails.
png("U:/NC410Manuscript/Results/new_SurvivalAverageCollagenExpression2.png", width = 12, height = 6, units = 'in', res = 300)
tryCatch(
  print(
    annotate_figure(
      figure,
      bottom = text_grob("Time in Months", color = "black", size = 20, face = "bold"),
      left = text_grob("Survival Probability", color = "black", rot = 90, size = 20, face = "bold")
    )
  ),
  finally = dev.off()
)


